#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
# @Description Efficiently read a corpus of 300,000 news documents
# @Time : 2020/1/4 14:47 
# @Author : sky 
# @Site :  
# @File : EfficRead.py 
# @Software: PyCharm
"""

import os
import time


# Iterators
class LoadFolders(object):
    """Iterable over the immediate sub-directories of a directory.

    Each iteration lists ``parent_path`` again and yields, for every entry
    that is a directory, ``os.path.join(parent_path, entry)``. Entries that
    are not directories are skipped. The order is whatever ``os.listdir``
    returns.
    """

    def __init__(self, parent_path):
        # Directory whose children are inspected on each iteration.
        self.parent_path = parent_path

    def __iter__(self):
        root = self.parent_path
        # Build the joined path of every entry lazily, then keep directories.
        candidates = (os.path.join(root, name) for name in os.listdir(root))
        for sub_dir in filter(os.path.isdir, candidates):
            yield sub_dir


class LoadFiles(object):
    """Iterable over the entries of a two-level ``category/entry`` tree.

    For every sub-directory of ``parent_path`` (as produced by
    ``LoadFolders``), yields a ``(category, name)`` tuple for each entry
    inside it, where ``category`` is the sub-directory's own name and
    ``name`` is the entry name returned by ``os.listdir`` (a bare name,
    not a full path). Second-level entries are not filtered, so nested
    directories are yielded as well.
    """

    def __init__(self, parent_path):
        # Root directory that contains one sub-directory per category.
        self.parent_path = parent_path

    def __iter__(self):
        # First level: one directory per category.
        for folder in LoadFolders(self.parent_path):
            # basename() understands every separator the platform accepts,
            # unlike split(os.sep), which only splits on the primary one and
            # can leave path fragments in the category (e.g. on Windows when
            # the parent path is written with forward slashes).
            category = os.path.basename(folder)
            # Second level: the entries stored under that category.
            for file_name in os.listdir(folder):
                yield category, file_name


if __name__ == '__main__':
    # Timing driver: walk the whole corpus tree, report progress every
    # 10,000 entries, then print the total elapsed wall-clock time.
    # The dataset path is relative to the current working directory.
    file_path = os.path.abspath('../dataset/CSCMNews')

    start_time = time.time()

    files = LoadFiles(file_path)
    # Only the running count matters here; the yielded tuple is ignored.
    for index, _ in enumerate(files):
        if index % 10000 == 0:
            print('{t} *** {i} \t docs have been read'.format(t=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()),
                                                              i=index))

    end_time = time.time()

    print('Total Cost Time %.2f' % (end_time - start_time) + 's')
